# Build GLD-23K Federated Dataset & Preprocess

import os
import tarfile
import urllib.request
import tensorflow_federated as tff
import time
import pandas as pd
import numpy as np
from PIL import Image
import time
import shutil

# --- Filesystem layout (placeholder paths; edit before running) ---

# Download locations: TFF cache and the raw image archives.
cache_dir = "/directory/for/full_landmarks"
data_dir = "/directory/for/landmark_images/"

# CSV listings read for the train and test partitions.
csv_path = "/directory/for/GLD_train_csv"
csv_path_test = "/directory/for/GLD_test_csv"

# Root under which load_data looks up <a>/<b>/<c>/<image_id>.jpg.
base_dir = "/directory/for/GLD_raw_images"

# Output directories for the per-client .npz files.
save_dir_train = "/directory/for/train_partition"
save_dir_test = "/directory/for/test_partition"

def load_data(df, data_dict, save_dir_train, train=True, num_test_clients=233):
    """
    Preprocess GLD-23K (landmarks subset) images and save one .npz per client.

    For every row of ``df`` the image ``<base_dir>/<a>/<b>/<c>/<image_id>.jpg``
    is opened (a, b, c being the first three characters of ``image_id``),
    converted to RGB, resized to 224x224, scaled to [0, 1] and normalized with
    the module-level ``mean`` / ``std``. The result and the row's ``class``
    value are appended to the client's entry in ``data_dict``.

    Args:
        df: DataFrame with ``image_id`` and ``class`` columns, plus ``user_id``
            when ``train`` is true.
        data_dict: Mapping ``user_id -> {'x': [...], 'y': [...]}``. It is
            mutated in place; entries already present are saved as well.
        save_dir_train: Directory receiving ``user_<id>.npz`` files. Despite
            the name it is used for whichever partition is being processed.
            Created if it does not exist.
        train: If true, clients come from the ``user_id`` column. Otherwise
            rows are dealt round-robin to ``num_test_clients`` synthetic
            clients numbered from 0.
        num_test_clients: Number of synthetic clients used when ``train`` is
            false.

    Raises:
        ValueError: If ``train`` is false and ``num_test_clients`` < 1.
    """
    if not train and num_test_clients < 1:
        raise ValueError("num_test_clients must be a positive integer")

    for position, (_, row) in enumerate(df.iterrows()):
        # Allocation of user id (client)
        user_id = row['user_id'] if train else position % num_test_clients
        image_id = row['image_id']
        label = row['class']
        file_path = os.path.join(
            base_dir, image_id[0], image_id[1], image_id[2], f"{image_id}.jpg"
        )

        # Preprocessing. The context manager releases the file handle right
        # after decoding; convert('RGB') guarantees three channels so that
        # grayscale / palette / RGBA / CMYK files broadcast against mean/std.
        with Image.open(file_path) as raw_image:
            resized = raw_image.convert('RGB').resize((224, 224))
        image_array = np.array(resized).astype(np.float32) / 255.0
        # NOTE(review): if mean/std are float64 arrays this promotes the
        # result to float64, doubling memory and file size - confirm intended.
        image_array = (image_array - mean) / std

        # Append image and label to client dictionary
        client = data_dict.setdefault(user_id, {'x': [], 'y': []})
        client['x'].append(image_array)
        client['y'].append(label)

    # Save for loading. All images stay in memory until this point.
    os.makedirs(save_dir_train, exist_ok=True)
    for user_id, data in data_dict.items():
        x = np.array(data['x'])
        y = np.array(data['y']).reshape(-1, 1)
        np.savez(os.path.join(save_dir_train, f"user_{user_id}.npz"), x=x, y=y)

# Fetch the GLD-23K split through TFF's gldv2 loader, caching under cache_dir.
# The returned datasets are not used further in this script; the call is made
# for the files it leaves in the cache.
t = time.time()
os.makedirs(cache_dir, exist_ok=True)
gldv2_train, gldv2_test = tff.simulation.datasets.gldv2.load_data(
    cache_dir=cache_dir,
    gld23k=True,
)
elapsed = time.time() - t
print(f"Time elapsed: {elapsed} seconds.")

# Download the raw train image archives and unpack them into data_dir.
base_url = 'https://s3.amazonaws.com/google-landmark/'
train_files = [f'train/images_{i:03d}.tar' for i in range(500)]
os.makedirs(data_dir, exist_ok=True)
for file in train_files:
    url = base_url + file
    filename = file.split('/')[-1]
    filepath = os.path.join(data_dir, filename)
    if not os.path.exists(filepath):
        print(f'Downloading {filename}...')
        # Download to a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated archive that a later
        # run would mistake for a complete one.
        partial_path = filepath + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, filepath)
    if filepath.endswith(".tar"):
        print(f'Extracting {filename}...')
        with tarfile.open(filepath) as tar:
            # Archives come from the network: use the 'data' extraction filter
            # (blocks absolute paths, '..' traversal and special files) when
            # the running Python provides it.
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(path=data_dir, filter='data')
            else:
                tar.extractall(path=data_dir)

# Start the timer and read the train / test CSV listings.
start_time = time.time()
df = pd.read_csv(csv_path)
df_test = pd.read_csv(csv_path_test)

# Begin from empty output directories: wipe anything left from a prior run.
for out_dir in (save_dir_train, save_dir_test):
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

# Use ImageNet statistics for preprocessing (read as globals by load_data).
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])
data_dict = {}

# Train partition: clients come from the CSV's user_id column.
load_data(df, {}, save_dir_train)
print("Data processing and saving completed successfully for train data.")
print(f"Total time: {time.time() - start_time} seconds elapsed.")

# Test partition: rows are spread over synthetic clients by load_data.
print("Data processing and saving starting for test data.")
start_time = time.time()
load_data(df_test, {}, save_dir_test, train=False)
print("Data processing and saving completed successfully for test data.")
print(f"Total time: {time.time() - start_time} seconds elapsed.")
